# Bank level variables:
# -- size
# -- specialization by industry
# -- average size on country level

# use instr_lvl_mb_dt_f.dta
# map results to dbtr_lvl_dt_f.dta and instr_lvl_dt_f.dta to run the regressions
# export debtor level and instrument level data sets
# export country level charts

import pandas as pd
import numpy as np
# supress scientific notation
pd.options.display.float_format ='{:.6f}'.format

# paths
path_data = r'P:\ECB business areas\DGR\Staff\papouts\Research\Active\Firm Heterogeneities\Final datasets'
path_data_nuts = r'P:\ECB business areas\DGR\Staff\papouts\Research\Active\Firm Heterogeneities\Final datasets\NUTS output data'
path_output = r'P:\ECB business areas\DGR\Staff\papouts\Research\Active\Firm Heterogeneities\Final charts\Revision'


# load data
dt = pd.read_stata(path_data + r'\instr_lvl_mb_dt_f.dta')
print(dt.columns)

# apply filters
dt.drop(dt[dt.nace_code == ''].index, inplace=True)
sel_countries = ['CY', 'EE', 'LT', 'LU', 'LV', 'MT', 'SI', 'SK']
dt.drop(dt[dt.cntry_dbtr.isin(sel_countries)].index, inplace=True)

##### Construct bank-related variables                      
# construct bank size
# creditor level
b_size = dt.groupby(['cntry_dbtr','crdtr_id'])['outst_amnt_crdtr_loan'].sum().rename('ONA_bank_ttl').reset_index()
b_size['log_ONA_bank_ttl'] = np.log(b_size['ONA_bank_ttl'])
# firm level size
dt['ONA_weight'] = dt['outst_amnt_per_crdtr'].div(dt['outst_amnt_ttl'])
dt = dt.merge(b_size,how='left',on=['cntry_dbtr','crdtr_id'])
dt['size_w'] = dt['ONA_weight']*dt['log_ONA_bank_ttl']
i_size = dt[['cntry_dbtr','dbtr_id','size_w']].drop_duplicates().groupby(['cntry_dbtr','dbtr_id'])['size_w'].sum().rename('i_size').reset_index()
dt = dt.merge(i_size,how='left',on=['cntry_dbtr','dbtr_id'])

# bank specialization
# country level
ona_c = dt.groupby(['cntry_dbtr'])['outst_amnt_byinstr'].sum().rename('ona_c').reset_index()
# instrument-country level
ona_ck = dt.groupby(['cntry_dbtr','nace_code'])['outst_amnt_byinstr'].sum().rename('ona_ck').reset_index()
# bank level
ona_bc = dt.groupby(['cntry_dbtr','crdtr_id'])['outst_amnt_crdtr_loan'].sum().rename('ona_bc').reset_index()
# instrument-bank level
ona_bkc = dt.groupby(['cntry_dbtr','crdtr_id','nace_code'])['outst_amnt_crdtr_loan'].sum().rename('ona_bkc').reset_index()
  
# merge and calculate specialization on creditor-sector level
b_spec = ona_bkc.merge(ona_bc,how='left',on=['cntry_dbtr','crdtr_id']) 
b_spec = b_spec.merge(ona_ck,how='left',on=['cntry_dbtr','nace_code']) 
b_spec = b_spec.merge(ona_c,how='left',on=['cntry_dbtr']) 
b_spec['bkc_spec'] = b_spec['ona_bkc'].div(b_spec['ona_bc'])
b_spec['ck_spec'] = b_spec['ona_ck'].div(b_spec['ona_c'])
b_spec['b_spec_diff'] = b_spec['bkc_spec'] - b_spec['ck_spec']

# merge with main data set
dt = dt.merge(b_spec,how='left',on=['cntry_dbtr','crdtr_id','nace_code']) 
# winsorize 0.01 and 0.99 percentiles
# check overall to winsorize
print(dt[['cntry_dbtr','crdtr_id','b_spec_diff']].drop_duplicates()['b_spec_diff'].quantile([0.01,0.5,0.95,0.99,0.999]))
# calculate thresholds
dt['b_spec_diff_orig'] = dt['b_spec_diff'] # keep original values
# calculate 0.01 percentile
perc_diff_01 = dt[['cntry_dbtr','crdtr_id','nace_code','b_spec_diff']].drop_duplicates()['b_spec_diff'].quantile([0.01]).rename('b_spec_diff_01')
# calculate 0.99 percentile
perc_diff_99 = dt[['cntry_dbtr','crdtr_id','nace_code','b_spec_diff']].drop_duplicates()['b_spec_diff'].quantile([0.99]).rename('b_spec_diff_99')
# apply rule
dt.loc[dt.b_spec_diff_orig < perc_diff_01.iloc[0], 'b_spec_diff'] = perc_diff_01.iloc[0]
dt.loc[dt.b_spec_diff_orig > perc_diff_99.iloc[0], 'b_spec_diff'] = perc_diff_99.iloc[0]
del perc_diff_01, perc_diff_99

# difference between nominator and denominator
dt['b_spec_diff_w'] = dt['b_spec_diff']*dt['ONA_weight'] 
# sum b_spec_w on debtor level and merge this with final data sets
b_spec_diff_d = dt.groupby(['cntry_dbtr','dbtr_id'])['b_spec_diff_w'].sum().reset_index().rename(columns={'b_spec_diff_w':'b_spec_diff_dbtr'})
# merge with main data set
dt = dt.merge(b_spec_diff_d,how='left',on=['cntry_dbtr','dbtr_id']) 

# check summary statistics
print(dt['b_spec_diff_dbtr'].describe())
    

##### Assortative matching
# firms - micro+small and mid-large -> 2 categories
dt['firms_class'] = 'small'
dt.loc[dt.sz_f == 2, 'firms_class'] = 'mid-large'
dt.loc[dt.sz_f == 1, 'firms_class'] = 'mid-large'
# check
print(dt[['sz_f','firms_class']].drop_duplicates())

# banks - use Total outstanding amount
# get the median (in terms of ONA) by country and classify the banks
# first, calculate shares - banks with shares above 50% should be large
totals_cntry = b_size.groupby(['cntry_dbtr'])['ONA_bank_ttl'].sum().reset_index().rename(columns={'ONA_bank_ttl':'cntry_ttl'})
b_size = b_size.merge(totals_cntry,how='left',on='cntry_dbtr')
b_size['ttl_lending_shares'] = b_size['ONA_bank_ttl'].div(b_size['cntry_ttl'])
# sort shares within country
b_size.sort_values(['cntry_dbtr','ttl_lending_shares'],inplace=True)
# calculate cumulative frequency by country
b_size['cum_freq'] = b_size.groupby(['cntry_dbtr'])['ttl_lending_shares'].cumsum()
# calculate minimum difference between shares
b_size['cum_freq_diff'] = 1 - b_size['cum_freq']
b_size['freq_abs_diff'] = abs(b_size['cum_freq'] - b_size['cum_freq_diff'])
# select the minimum value
min_diff = b_size.groupby(['cntry_dbtr'])['freq_abs_diff'].min().reset_index().rename(columns={'freq_abs_diff':'min_diff'})
b_size = b_size.merge(min_diff, how='left',on='cntry_dbtr')
b_size['cut_off'] = b_size.loc[b_size.freq_abs_diff == b_size.min_diff,'cum_freq']
cut_off = b_size.groupby(['cntry_dbtr'])['cut_off'].min().reset_index().rename(columns={'cut_off':'cut_off_val'})
b_size = b_size.merge(cut_off, how='left', on='cntry_dbtr')
# classify
b_size['banks_class'] = 'small'
b_size.loc[(b_size.cum_freq > b_size.cut_off_val), 'banks_class'] = 'large'
# check
print(b_size[['cntry_dbtr','banks_class']].drop_duplicates().sort_values(by='cntry_dbtr'))
t1 = b_size.groupby(['cntry_dbtr','banks_class'])['ONA_bank_ttl'].sum().reset_index()
t1 = t1.merge(b_size[['cntry_dbtr','cntry_ttl']].drop_duplicates(), how='left',on=['cntry_dbtr'])
t1['shares'] = t1['ONA_bank_ttl'].div(t1['cntry_ttl'])
print(t1)
# map to main data set
dt = dt.merge(b_size[['cntry_dbtr', 'crdtr_id', 
                     'ttl_lending_shares', 'cum_freq', 'banks_class']],how='left',on=['cntry_dbtr','crdtr_id'])
# save classification
b_size.to_stata(path_data + r'\b_size_classification.dta', write_index=False)


# lending to SMEs and Large
# shares
# calculate total outstanding amounts by bank class
ona_ttl = dt.groupby(['cntry_dbtr','banks_class'])['outst_amnt_crdtr_loan'].sum().reset_index().rename(columns={'outst_amnt_crdtr_loan':'ona_ttl'})
# calculate totals by groups
ona_firm_bank = dt.groupby(['cntry_dbtr','firms_class','banks_class'])['outst_amnt_crdtr_loan'].sum().reset_index().rename(columns={'outst_amnt_crdtr_loan':'ona_firm_bank'})
# merge
ona_firm_bank = ona_firm_bank.merge(ona_ttl, how='left',on=['cntry_dbtr','banks_class'])
# calculate shares
ona_firm_bank['shares_credit'] = ona_firm_bank['ona_firm_bank'].div(ona_firm_bank['ona_ttl'])
# save data set
ona_firm_bank.to_stata(path_data + r'\shares_credit.dta', write_index=False)

# for the table
# reshape table
ona_pivot = ona_firm_bank.pivot_table(index=['cntry_dbtr'], columns=['banks_class','firms_class'], values='shares_credit', aggfunc=np.sum)
# round values to the second digit
ona_pivot = ona_pivot.round(2)
ona_pivot = ona_pivot.apply(lambda x: x.map('{:,.2f}'.format))
# reindex
ona_pivot = ona_pivot.reindex(['small','large'], axis=1, level=0) # bank size
ona_pivot = ona_pivot.reindex(['small','mid-large'], axis=1, level=1) # firm size

# save in excel
ona_pivot.to_excel(path_output + r'\shares_credit_table.xlsx')
# save to tex file
tex_table = ona_pivot.to_latex( 
                       column_format='lccccc', 
                       multicolumn_format='c',
                       caption='Lending of small and large banks to firms by firm size classes')
print(tex_table)
ona_pivot.to_latex(path_output + r'\shares_credit_table.tex', 
                       column_format='lccccc', 
                       multicolumn_format='c',
                       caption='Lending of small and large banks to firms by firm size classes')

# main bank shares
# we need only the main banks
mb_dt = dt.loc[dt.mb_flag == 1][['cntry_dbtr','dbtr_id','crdtr_id','firms_class','banks_class']].drop_duplicates()
# count total firms by groups
count_ttl = mb_dt.groupby(['cntry_dbtr','firms_class'])['dbtr_id'].count().reset_index().rename(columns={'dbtr_id':'count_ttl'})
# count firms by banks
count_firm_bank = mb_dt.groupby(['cntry_dbtr','firms_class','banks_class'])['dbtr_id'].count().reset_index().rename(columns={'dbtr_id':'count_firm_bank'})
# merge
count_firm_bank = count_firm_bank.merge(count_ttl, how='left',on=['cntry_dbtr','firms_class'])
# calculate shares
count_firm_bank['shares_count'] = count_firm_bank['count_firm_bank'].div(count_firm_bank['count_ttl'])
# save data set
count_firm_bank.to_stata(path_data + r'\shares_count.dta', write_index=False)

# count all firms by banks
count_all_bybank = mb_dt.groupby(['cntry_dbtr','banks_class'])['dbtr_id'].count().reset_index().rename(columns={'dbtr_id':'count_all_bybank'})
# count all firms
count_all = mb_dt.groupby(['cntry_dbtr'])['dbtr_id'].count().reset_index().rename(columns={'dbtr_id':'count_all'})
# merge 
count_all_bybank = count_all_bybank.merge(count_all, how='left',on=['cntry_dbtr'])
# calculate shares
count_all_bybank['shares_all'] = count_all_bybank['count_all_bybank'].div(count_all_bybank['count_all'])
# we can get these numbers from the shares_count, therefore no need to save an additional data set

# for the table
# reshape table
count_pivot = count_firm_bank.pivot_table(index=['cntry_dbtr'], columns=['banks_class','firms_class'], values='shares_count', aggfunc=np.sum)
# round values to the second digit
count_pivot = count_pivot.round(2)
count_pivot = count_pivot.apply(lambda x: x.map('{:,.2f}'.format))
# reindex
count_pivot = count_pivot.reindex(['small','large'], axis=1, level=0) # bank size
count_pivot = count_pivot.reindex(['small','mid-large'], axis=1, level=1) # firm size

# reshape table
count_all_pivot = count_all_bybank.pivot_table(index=['cntry_dbtr'], columns=['banks_class'], values='shares_all', aggfunc=np.sum)
# round values to the second digit
count_all_pivot = count_all_pivot.round(2)
count_all_pivot = count_all_pivot.apply(lambda x: x.map('{:,.2f}'.format))
# reindex
count_all_pivot = count_all_pivot[['small','large']]

# merge the tables
#count_pivot_merged = count_all_pivot.merge(count_pivot, how='left', left_index=True, right_index=True)
count_pivot_merged = pd.concat([count_all_pivot, count_pivot], axis=1)

# new index
count_pivot_merged['small','all'] = count_pivot_merged.iloc[:,0]
# drop redundant columns
count_pivot_merged.drop(['small','large'], axis=1, inplace=True)
# rearrange
count_pivot_merged = count_pivot_merged[[('small', 'all'), ('small', 'small'),('small', 'mid-large'),('large', 'small'),('large', 'mid-large')]]
# save to excel
count_pivot_merged.to_excel(path_output + r'\shares_count_table.xlsx')

# save to tex file
tex_table = count_pivot_merged.to_latex( 
                       column_format='lcccccc', 
                       multicolumn_format='c',
                       caption='Share of firms for which a S or a L bank is the main bank, by firm size')
print(tex_table)
count_pivot_merged.to_latex(path_output + r'\shares_count_table.tex', 
                       column_format='lcccccc', 
                       multicolumn_format='c',
                       caption='Share of firms for which a S or a L bank is the main bank, by firm size')


##### MERGE AND EXPORT

# load final data sets
dt_f_d = pd.read_stata(path_data+r'\dbtr_lvl_dt_f.dta')
dt_f_i = pd.read_stata(path_data+r'\instr_lvl_dt_f.dta')

# merge nuts codes
# HHI NUTS3
# load data set - only one is needed, the NUTS codes are all on debtor level
dbtr_nuts = pd.read_stata(path_data+r'\dbtr_lvl_dt_nuts.dta')
dt_f_d = dt_f_d.merge(dbtr_nuts[['cntry_dbtr', 'dbtr_id','nuts3_dbtr_final','hhi_nuts3']],how='left',on=['cntry_dbtr', 'dbtr_id'])
dt_f_i = dt_f_i.merge(dbtr_nuts[['cntry_dbtr', 'dbtr_id','nuts3_dbtr_final','hhi_nuts3']],how='left',on=['cntry_dbtr', 'dbtr_id'])
del dbtr_nuts

# merge bank-related variables on debtor level
# i_size
dt_f_d = dt_f_d.merge(i_size,how='left',on=['cntry_dbtr', 'dbtr_id'])
dt_f_i = dt_f_i.merge(i_size,how='left',on=['cntry_dbtr', 'dbtr_id'])
# b_spec_diff_d
dt_f_d = dt_f_d.merge(b_spec_diff_d,how='left',on=['cntry_dbtr', 'dbtr_id'])
dt_f_i = dt_f_i.merge(b_spec_diff_d,how='left',on=['cntry_dbtr', 'dbtr_id'])

     
# export
# includes banks and variables on bank level
dt.to_stata(path_data + r'\instr_bank_level_dt.dta', write_index=False)
# debtor level
dt_f_d.to_stata(path_data + r'\dbtr_level_b_dt.dta', write_index=False)
# instrument level
dt_f_i.to_stata(path_data + r'\instr_level_b_dt.dta', write_index=False)





